{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('./kc_house_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>...</th>\n",
       "      <th>grade</th>\n",
       "      <th>sqft_above</th>\n",
       "      <th>sqft_basement</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "      <th>zipcode</th>\n",
       "      <th>lat</th>\n",
       "      <th>long</th>\n",
       "      <th>sqft_living15</th>\n",
       "      <th>sqft_lot15</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7129300520</td>\n",
       "      <td>20141013T000000</td>\n",
       "      <td>221900.0</td>\n",
       "      <td>3</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1180</td>\n",
       "      <td>5650</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>7</td>\n",
       "      <td>1180</td>\n",
       "      <td>0</td>\n",
       "      <td>1955</td>\n",
       "      <td>0</td>\n",
       "      <td>98178</td>\n",
       "      <td>47.5112</td>\n",
       "      <td>-122.257</td>\n",
       "      <td>1340</td>\n",
       "      <td>5650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6414100192</td>\n",
       "      <td>20141209T000000</td>\n",
       "      <td>538000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.25</td>\n",
       "      <td>2570</td>\n",
       "      <td>7242</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>7</td>\n",
       "      <td>2170</td>\n",
       "      <td>400</td>\n",
       "      <td>1951</td>\n",
       "      <td>1991</td>\n",
       "      <td>98125</td>\n",
       "      <td>47.7210</td>\n",
       "      <td>-122.319</td>\n",
       "      <td>1690</td>\n",
       "      <td>7639</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5631500400</td>\n",
       "      <td>20150225T000000</td>\n",
       "      <td>180000.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.00</td>\n",
       "      <td>770</td>\n",
       "      <td>10000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>770</td>\n",
       "      <td>0</td>\n",
       "      <td>1933</td>\n",
       "      <td>0</td>\n",
       "      <td>98028</td>\n",
       "      <td>47.7379</td>\n",
       "      <td>-122.233</td>\n",
       "      <td>2720</td>\n",
       "      <td>8062</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2487200875</td>\n",
       "      <td>20141209T000000</td>\n",
       "      <td>604000.0</td>\n",
       "      <td>4</td>\n",
       "      <td>3.00</td>\n",
       "      <td>1960</td>\n",
       "      <td>5000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>7</td>\n",
       "      <td>1050</td>\n",
       "      <td>910</td>\n",
       "      <td>1965</td>\n",
       "      <td>0</td>\n",
       "      <td>98136</td>\n",
       "      <td>47.5208</td>\n",
       "      <td>-122.393</td>\n",
       "      <td>1360</td>\n",
       "      <td>5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1954400510</td>\n",
       "      <td>20150218T000000</td>\n",
       "      <td>510000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1680</td>\n",
       "      <td>8080</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>8</td>\n",
       "      <td>1680</td>\n",
       "      <td>0</td>\n",
       "      <td>1987</td>\n",
       "      <td>0</td>\n",
       "      <td>98074</td>\n",
       "      <td>47.6168</td>\n",
       "      <td>-122.045</td>\n",
       "      <td>1800</td>\n",
       "      <td>7503</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           id             date     price  bedrooms  bathrooms  sqft_living  \\\n",
       "0  7129300520  20141013T000000  221900.0         3       1.00         1180   \n",
       "1  6414100192  20141209T000000  538000.0         3       2.25         2570   \n",
       "2  5631500400  20150225T000000  180000.0         2       1.00          770   \n",
       "3  2487200875  20141209T000000  604000.0         4       3.00         1960   \n",
       "4  1954400510  20150218T000000  510000.0         3       2.00         1680   \n",
       "\n",
       "   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \\\n",
       "0      5650     1.0           0     0  ...      7        1180              0   \n",
       "1      7242     2.0           0     0  ...      7        2170            400   \n",
       "2     10000     1.0           0     0  ...      6         770              0   \n",
       "3      5000     1.0           0     0  ...      7        1050            910   \n",
       "4      8080     1.0           0     0  ...      8        1680              0   \n",
       "\n",
       "   yr_built  yr_renovated  zipcode      lat     long  sqft_living15  \\\n",
       "0      1955             0    98178  47.5112 -122.257           1340   \n",
       "1      1951          1991    98125  47.7210 -122.319           1690   \n",
       "2      1933             0    98028  47.7379 -122.233           2720   \n",
       "3      1965             0    98136  47.5208 -122.393           1360   \n",
       "4      1987             0    98074  47.6168 -122.045           1800   \n",
       "\n",
       "   sqft_lot15  \n",
       "0        5650  \n",
       "1        7639  \n",
       "2        8062  \n",
       "3        5000  \n",
       "4        7503  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.drop(columns=['id', 'date', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15' ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>condition</th>\n",
       "      <th>grade</th>\n",
       "      <th>sqft_above</th>\n",
       "      <th>sqft_basement</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2.161300e+04</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>2.161300e+04</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "      <td>21613.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>5.400881e+05</td>\n",
       "      <td>3.370842</td>\n",
       "      <td>2.114757</td>\n",
       "      <td>2079.899736</td>\n",
       "      <td>1.510697e+04</td>\n",
       "      <td>1.494309</td>\n",
       "      <td>0.007542</td>\n",
       "      <td>0.234303</td>\n",
       "      <td>3.409430</td>\n",
       "      <td>7.656873</td>\n",
       "      <td>1788.390691</td>\n",
       "      <td>291.509045</td>\n",
       "      <td>1971.005136</td>\n",
       "      <td>84.402258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.671272e+05</td>\n",
       "      <td>0.930062</td>\n",
       "      <td>0.770163</td>\n",
       "      <td>918.440897</td>\n",
       "      <td>4.142051e+04</td>\n",
       "      <td>0.539989</td>\n",
       "      <td>0.086517</td>\n",
       "      <td>0.766318</td>\n",
       "      <td>0.650743</td>\n",
       "      <td>1.175459</td>\n",
       "      <td>828.090978</td>\n",
       "      <td>442.575043</td>\n",
       "      <td>29.373411</td>\n",
       "      <td>401.679240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>7.500000e+04</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>290.000000</td>\n",
       "      <td>5.200000e+02</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>290.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1900.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>3.219500e+05</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>1.750000</td>\n",
       "      <td>1427.000000</td>\n",
       "      <td>5.040000e+03</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>1190.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1951.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>4.500000e+05</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>2.250000</td>\n",
       "      <td>1910.000000</td>\n",
       "      <td>7.618000e+03</td>\n",
       "      <td>1.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>1560.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1975.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.450000e+05</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>2.500000</td>\n",
       "      <td>2550.000000</td>\n",
       "      <td>1.068800e+04</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>2210.000000</td>\n",
       "      <td>560.000000</td>\n",
       "      <td>1997.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>7.700000e+06</td>\n",
       "      <td>33.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>13540.000000</td>\n",
       "      <td>1.651359e+06</td>\n",
       "      <td>3.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>13.000000</td>\n",
       "      <td>9410.000000</td>\n",
       "      <td>4820.000000</td>\n",
       "      <td>2015.000000</td>\n",
       "      <td>2015.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              price      bedrooms     bathrooms   sqft_living      sqft_lot  \\\n",
       "count  2.161300e+04  21613.000000  21613.000000  21613.000000  2.161300e+04   \n",
       "mean   5.400881e+05      3.370842      2.114757   2079.899736  1.510697e+04   \n",
       "std    3.671272e+05      0.930062      0.770163    918.440897  4.142051e+04   \n",
       "min    7.500000e+04      0.000000      0.000000    290.000000  5.200000e+02   \n",
       "25%    3.219500e+05      3.000000      1.750000   1427.000000  5.040000e+03   \n",
       "50%    4.500000e+05      3.000000      2.250000   1910.000000  7.618000e+03   \n",
       "75%    6.450000e+05      4.000000      2.500000   2550.000000  1.068800e+04   \n",
       "max    7.700000e+06     33.000000      8.000000  13540.000000  1.651359e+06   \n",
       "\n",
       "             floors    waterfront          view     condition         grade  \\\n",
       "count  21613.000000  21613.000000  21613.000000  21613.000000  21613.000000   \n",
       "mean       1.494309      0.007542      0.234303      3.409430      7.656873   \n",
       "std        0.539989      0.086517      0.766318      0.650743      1.175459   \n",
       "min        1.000000      0.000000      0.000000      1.000000      1.000000   \n",
       "25%        1.000000      0.000000      0.000000      3.000000      7.000000   \n",
       "50%        1.500000      0.000000      0.000000      3.000000      7.000000   \n",
       "75%        2.000000      0.000000      0.000000      4.000000      8.000000   \n",
       "max        3.500000      1.000000      4.000000      5.000000     13.000000   \n",
       "\n",
       "         sqft_above  sqft_basement      yr_built  yr_renovated  \n",
       "count  21613.000000   21613.000000  21613.000000  21613.000000  \n",
       "mean    1788.390691     291.509045   1971.005136     84.402258  \n",
       "std      828.090978     442.575043     29.373411    401.679240  \n",
       "min      290.000000       0.000000   1900.000000      0.000000  \n",
       "25%     1190.000000       0.000000   1951.000000      0.000000  \n",
       "50%     1560.000000       0.000000   1975.000000      0.000000  \n",
       "75%     2210.000000     560.000000   1997.000000      0.000000  \n",
       "max     9410.000000    4820.000000   2015.000000   2015.000000  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       20699\n",
       "2014       91\n",
       "2013       37\n",
       "2003       36\n",
       "2000       35\n",
       "        ...  \n",
       "1934        1\n",
       "1959        1\n",
       "1951        1\n",
       "1948        1\n",
       "1944        1\n",
       "Name: yr_renovated, Length: 70, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[\"yr_renovated\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Coefficients: \n",
      " [-4.32662730e+04  4.86167386e+04  1.12142142e+02 -2.68282264e-01\n",
      "  2.26383921e+04  5.86529369e+05  4.46321576e+04  1.92295623e+04\n",
      "  1.22589145e+05  5.74229299e+01  5.47192120e+01 -3.52030825e+03\n",
      "  1.18792551e+01]\n",
      "Mean squared error: 52470226955.91\n",
      "Coefficient of determination: 0.65\n"
     ]
    }
   ],
   "source": [
    "x = data.drop(['price'],axis=1)\n",
    "y = data['price']\n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y)\n",
    "reg = LinearRegression().fit(x_train, y_train)\n",
    "\n",
    "#exporting model\n",
    "pickle.dump(reg,open(\"model.pkl\",\"wb\"))\n",
    "\n",
    "y_prediction = reg.predict(x_test)\n",
    "\n",
    "\n",
    "print('Coefficients: \\n', reg.coef_)\n",
    "# The mean squared error\n",
    "print('Mean squared error: %.2f'\n",
    "      % mean_squared_error(y_test, y_prediction))\n",
    "# The coefficient of determination: 1 is perfect prediction\n",
    "print('Coefficient of determination: %.2f'\n",
    "      % r2_score(y_test, y_prediction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([776954.62504178])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg.predict(data.drop(['price'],axis=1).iloc[99].values.reshape(1, -1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[3.0000e+00, 2.5000e+00, 2.3200e+03, 3.6847e+04, 2.0000e+00,\n",
       "        0.0000e+00, 2.0000e+00, 3.0000e+00, 9.0000e+00, 2.3200e+03,\n",
       "        0.0000e+00, 1.9920e+03, 0.0000e+00]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.drop(['price'],axis=1).iloc[99].values.reshape(1, -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
